/**********************************************************************/
/*
   	Author: Michelle Han, Adapted from Nikhil Kumar
	Last Updated: July 2024
   	Description: Cleans raw March 2019 SUSENAS data. Outputs deidentified
	cleaned SUSENAS data merged with PMO data.

	This cleaning file should output 2 different datasets:
	1. Cleaned Susenas data:
	sus_mar19_deid_clean
	2. Subset of Susenas person-batch data matched with PMO:
	sus_mar19_deid_clean_merged

*/
/**********************************************************************/


*******************************************
* Setup
* Pull in the project filepaths unless a master do-file has already set them
	if "$master_run" !="1" include "./Do/SET_FILEPATHS.do"

* Open a fresh text log whose name is prefixed with today's date (%tdCYND)
	capture log close
	global prefix: display %tdCYND td(`c(current_date)')
	log using "$KP_logs/${prefix}_clean_SUSENAS_mar2019.txt", text replace

* Load the data

* Household food / non-food consumption: keep three columns and park them in a tempfile
	import dbase "$KP_deid_susenas/Raw/Household/dbf/mar19/blok43_diseminasi.dbf", clear
	keep RENUM FOOD NONFOOD
	rename (RENUM FOOD NONFOOD) (renum food nonfood)

	tempfile hh_cons
	save `hh_cons'

* Individual-level file; attach the household file and the consumption tempfile by renum.
* The household merge keeps unmatched individuals (keep(1 3)); the consumption merge
* aborts unless every observation matches (assert(3)).
	use "$KP_deid_susenas/Raw/Individual/SUSENAS_MAR19_deid.dta", clear
	merge m:1 renum using "$KP_deid_susenas/Raw/Household/susenas19mar_rt_diseminasi.dta", keep(1 3) nogen
	merge m:1 renum using `hh_cons', assert(3) nogen

*******************************************
* Recode yes/no questions

* Value label shared by the 0/1 indicators in this file
	label define yesno 0 "No" 1 "Yes"

	local yesno_vars r2001*

* Raw answers must be 1, 5 or system missing; anything else stops the run.
* After the loop each variable is 1 (was 1) or 0 (was 5 or missing).
	foreach v of varlist `yesno_vars' {
		assert inlist(`v', 1, 5) | `v' == .
		* missing is set to 0 too: for questions dependent on previous skip logic
		replace `v' = 0 if `v' == 5 | mi(`v')
		label values `v' yesno
	}

*******************************************
* Demographics

* City (kotamadya) dummy: 1 when the r102 code is 70 or above
* NOTE(review): Stata treats missing as larger than any number, so a missing r102
* would also give city = 1 - confirm r102 is never missing.
	generate kode_kab = r102
	generate city = (kode_kab >= 70)

* Gender: two mirrored indicators built from r405
	generate gender = (r405 == 1)
	label define gender 0 "Female" 1 "Male"
	label values gender gender

	generate female = (r405 == 2)
	label define female 0 "Male" 1 "Female"
	label values female female

* Marital status (raw r404 codes)
	generate marital_status = r404

* Bucket age into 20-year bands (values outside 0-100 are left as they are by recode)
	generate age = r407
	recode age (0/20 = 1) (21/40 = 2) (41/60 = 3) (61/80 = 4) (81/100 = 5), gen(age_cat)

* Dummy for age <= 30 (a missing age evaluates to 0 here)
	generate young = (age <= 30)
	label define young 0 "Over 30 Years Old" 1 "30 and Under"
	label values young young
	
* Education (translated to years of schooling)
* r615 codes outside 1-21, and missing r615, leave school_years missing.
	generate educ = r615
	generate school_years = ///
		cond(inlist(educ, 1, 2, 3, 4), 6, ///              Paket A, SDLB, SD, MI
		cond(inlist(educ, 5, 6, 7, 8), 9, ///              Paket B, SMP LB, SMP, MTs
		cond(inlist(educ, 9, 10, 11, 12, 13, 14), 12, ///  Paket C, SMLB, SMA, MA, SMK, MAK
		cond(educ == 15, 13.5, ///                         Diploma I/II
		cond(educ == 16, 15, ///                           Diploma III
		cond(inlist(educ, 17, 18), 16, ///                 Diploma IV / SI
		cond(inlist(educ, 19, 20), 18, ///                 SII/Profesi
		cond(educ == 21, 20, .))))))))                  // SIII

* Dummy for educated (graduating high school)
* NOTE(review): the comparison is strictly "> 12", so exactly 12 years is coded 0 even
* though the value label reads "12 or More Years of Schooling". Also, Stata treats
* missing as larger than any number, so a missing school_years is coded 1.
* Changing either would alter the data and therefore the pinned datasignature values
* checked before the saves further down - confirm intent before touching.
	generate educated = (school_years > 12)
	label define educated 0 "Less than 12 Years Schooling" 1 "12 or More Years of Schooling"
	label values educated educated

* Number of HH members
	generate hh_size = r301

* Log per-capita household consumption
* NOTE(review): log() of a zero or negative value is missing - confirm that is acceptable.
	generate log_cons = log(exp_cap)

* Winsorised copy of exp_cap: clamp to the 1st / 99th percentiles, where the
* percentiles are computed over strictly positive values only
	tempname p_lo p_hi
	generate cons_adj = exp_cap
	summarize cons_adj if cons_adj > 0, detail
	scalar `p_lo' = r(p1)
	scalar `p_hi' = r(p99)
	replace cons_adj = `p_hi' if cons_adj > `p_hi' & !mi(cons_adj)
	replace cons_adj = `p_lo' if cons_adj < `p_lo' & !mi(cons_adj)
	summarize cons_adj, detail
	generate log_adj_cons = log(cons_adj)

* Food and non-food per capita (household total divided by r301), each with:
*   cons_<tag>      per-capita level
*   lcon_<tag>      its log
*   cons_adj_<tag>  level clamped to its own 1st / 99th percentiles (all values), then / 1000
*   lcon_adj_<tag>  log of the clamped, rescaled level
	local hh_totals food nonfood
	local tags      food nfood
	forvalues k = 1/2 {
		local total : word `k' of `hh_totals'
		local tag   : word `k' of `tags'

		generate cons_`tag' = `total' / r301
		generate lcon_`tag' = log(cons_`tag')

		generate cons_adj_`tag' = cons_`tag'
		summarize cons_adj_`tag', detail
		scalar `p_lo' = r(p1)
		scalar `p_hi' = r(p99)
		replace cons_adj_`tag' = `p_hi' if cons_adj_`tag' > `p_hi' & !mi(cons_adj_`tag')
		replace cons_adj_`tag' = `p_lo' if cons_adj_`tag' < `p_lo' & !mi(cons_adj_`tag')
		replace cons_adj_`tag' = cons_adj_`tag' / 1000
		generate lcon_adj_`tag' = log(cons_adj_`tag')
	}

* Do you (applicant) live in same kabupaten you were born in?
* Compares r603 with kode_kab (the copy of r102 made above).
* NOTE(review): two missing codes compare as equal and would yield 1; also presumably
* these codes are only unique within a province - verify whether province must match too.
	generate born_kab = r603
	generate born_live_same = (born_kab == kode_kab)

* Durable goods
* Copy each r2001 item (recoded to 0/1 in the yes/no section above) into a named dummy:
*   a  dur_canister   5.5 kilos or more LPG canister
*   b  dur_fridge     fridge/freezer
*   c  dur_aircon     air conditioner
*   d  dur_waterheat  water heater
*   e  dur_phone      telephone
*   f  dur_computer   computer
*   g  dur_gold       gold
*   h  dur_bike       motorcycle
*   i  dur_boat       boat
*   j  dur_motorboat  motorboat
*   k  dur_car        car
*   l  dur_tv         flat television
*   m  dur_land       land
	local items   canister fridge aircon waterheat phone computer gold bike boat motorboat car tv land
	local letters a b c d e f g h i j k l m
	local n_items : word count `items'
	forvalues k = 1/`n_items' {
		local item   : word `k' of `items'
		local letter : word `k' of `letters'
		generate dur_`item' = r2001`letter'
	}

* Check: tabulate every dur_* variable (including missing) and attach the yes/no label
	foreach v of varlist dur_* {
		tabulate `v', missing
		label values `v' yesno
	}

*******************************************
* Telecom
* Every indicator below is 1 when the stated condition holds and 0 otherwise
* (including when the source variable is missing or empty).

* Used mobile phone in last 3 months
	generate mobile_use = (r801 == 1)

* Own mobile phone in last 3 months
	generate mobile_ownership = (r802 == 1)

* Used computer in last 3 months (string flag r803_x equal to "X")
	generate computer_use = (r803_x == "X")

* In the last 3 months, have you (applicant) accessed internet using PC/laptop/tablet
	generate use_internet_comp = (r805_a == "A" | r805_b == "B" | r805_c == "C")

* In the last 3 months, have you (applicant) accessed internet using cellphone
	generate use_internet_phone = (r805_d == "D")

* In the last 3 months, have you (applicant) accessed internet?
	generate use_internet = (r804 == 1)

* Why use internet: the r807 letter flags are grouped into purposes
	generate internet_info = (r807_a == "A" | r807_b == "B")
	generate internet_social = (r807_c == "C" | r807_d == "D")
	generate internet_buy_sell = (r807_e == "E" | r807_f == "F" | r807_i == "I")
	generate internet_entertain = (r807_g == "G")
	generate use_internet_bank = (r807_h == "H")

*******************************************
* Employment

* Employed in week prior to survey: r705 equal to 1-5
* NOTE(review): code 6 (labelled "Family/Unpaid Worker" below) is not counted as
* employed, so hours for those workers are set to 0 further down - confirm intended.
	generate employed = inlist(r705, 1, 2, 3, 4, 5)

* 705--work status questions during the "final week"
	generate employment_status = r705

	generate self_employed = (r705 == 1)
	generate business_owner = inlist(r705, 2, 3)
	generate self_emp_bus_owner = inlist(r705, 1, 2, 3)
	generate perm_worker = (r705 == 4)
	generate freelancer = (r705 == 5)
	generate family_unpaid = (r705 == 6)

	label define emp 1 "Self-employed" 2 "Business with Temporary Employees" ///
			3 "Business with Permanent Employees" ///
			4 "Permanent Worker" 5 "Freelancer" 6 "Family/Unpaid Worker", replace
	label values employment_status emp

	tabulate employed employment_status, missing

* 706, 707- Hours Worked (forced to 0 for anyone with employed == 0)
	generate hrs_worked_main = cond(employed == 0, 0, r706)
	generate hrs_worked_all = cond(employed == 0, 0, r707)

* Industry
	generate ind_code = r704

* In the last week, did you work or have work or a business but temporarily not working?  (same as SAKARNAS 9A + 10A --can also stack and add a dummy)
* Raw code 5 becomes 0, and anyone flagged as employed is set to 0 as well.
	generate temp_not_work = r703
	replace temp_not_work = 0 if temp_not_work == 5 | employed
	tabulate temp_not_work, missing
	tabulate temp_not_work employment_status, missing
	tabulate temp_not_work employed, missing

* Household business owners: household-level maximum of the individual flags
	bysort renum: egen hh_business = max(business_owner)
	bysort renum: egen hh_emp_self = max(self_employed)

* Does anyone in the household own a business with employees (either formal or informal)
	generate with_emp = inlist(r705, 2, 3)
	bysort renum: egen with_emp_HH_S = max(with_emp)


*******************************************
* Assets
* Indicators built from the r2001 items, which the yes/no section above recoded to 0/1.

* Motorbike
	generate motorbike = (r2001h == 1)

* Gold/jewelry
	generate gold_jewel = (r2001g == 1)

* Flat screen TV
	generate flat_TV = (r2001l == 1)

* House characteristics:
* # sqm of the house
	generate hh_area = r1804

*******************************************
* Program Receipt

* 2105: Family Welfare Card (Kartu Keluarga Sejahtera, KKS) - codes 1 and 2 both count as yes
	generate receive_KKS = inlist(r2105, 1, 2)
	label values receive_KKS yesno
	tabulate receive_KKS

* Remaining programmes: indicator is 1 when the source question equals 1, then tabulated
*   ever_BPNT         2109:  receive BPNT
*   receive_pkh_year  2106:  PKH last year
*   receive_pkh_now   2107a: PKH Now
*   local_assist      2111:  any assistance from local governments
*   r_assist          2112a: routine assistance from local governments
*   nr_assist         2112b: non-routine assistance from local governments
	local flags     ever_BPNT receive_pkh_year receive_pkh_now local_assist r_assist nr_assist
	local questions r2109     r2106            r2107a          r2111        r2112a   r2112b
	local n_flags : word count `flags'
	forvalues k = 1/`n_flags' {
		local flag     : word `k' of `flags'
		local question : word `k' of `questions'
		generate `flag' = (`question' == 1)
		tabulate `flag'
	}

******************************************************
* check duplicates
* anon_id4_dupe counts how many other observations share the same anon_id4 (0 = unique)
	duplicates tag anon_id4, generate(anon_id4_dupe)
	tabulate anon_id4_dupe

* drop duplicates
* Every copy of a repeated, non-missing anon_id4 is removed (none is kept);
* observations with a missing anon_id4 are left in place.
	drop if !mi(anon_id4) & anon_id4_dupe > 0
	drop anon_id4_dupe
/*----------------------------------------------------*/
              * Export cleaned data
/*----------------------------------------------------*/

* The cleaned file is saved only when the data in memory reproduces the pinned
* data signature. On a mismatch the do-file ends with return code 459
* ("something that should be true of your data is not"). Previously this relied on
* the unrecognized command -stop-, which halted with a misleading r(199).
* Both signatures are displayed so the pinned value can be updated deliberately.
	rename fwt weight
	datasignature
	local sig_found "`r(datasignature)'"
	local sig_expected "1204460:553(29841):3638558706:2154947136"

	if "`sig_found'" == "`sig_expected'" {
		save "$KP_deid_susenas/Clean/sus_mar19_deid_clean.dta", replace
	}
	else {
		di as err "Careful, your machine produces a different dataset"
		di as err "  expected signature: `sig_expected'"
		di as err "  found signature:    `sig_found'"
		exit 459
	}
	

/*----------------------------------------------------*/
          * Merge with admin data and export
/*----------------------------------------------------*/

* Keep only observations that have an anon_id4 and drop the raw questionnaire
* variables (every variable whose name starts with r1 ... r9).
	gen sus_round = 3
	keep if !mi(anon_id4)
	drop r1* r2* r3* r4* r5* r6* r7* r8* r9*

	* Merge with PMO data
	* NOTE(review): -force- turns any non-numeric date_incentive into missing
	* without raising an error - confirm that is intended.
	destring date_incentive, replace force

	* One Susenas person can match many PMO rows; only matched rows are kept
	merge 1:m anon_id4 using "$KP_deid_admin/Clean/pmo_b1-22_clean_long_deid.dta", nogen keep(3)

* The merged file is saved only when the data in memory reproduces the pinned
* data signature. On a mismatch the do-file ends with return code 459
* ("something that should be true of your data is not"). Previously this relied on
* the unrecognized command -stop-, which halted with a misleading r(199).
	compress
	datasignature
	local sig_found "`r(datasignature)'"
	local sig_expected "194562:164(54521):1111004167:749032878"

	if "`sig_found'" == "`sig_expected'" {
		save "$KP_deid_susenas/Clean/sus_mar19_deid_clean_merged.dta", replace
	}
	else {
		di as err "Careful, your machine produces a different dataset"
		di as err "  expected signature: `sig_expected'"
		di as err "  found signature:    `sig_found'"
		exit 459
	}

* Close the log opened in the setup section so it is flushed and not left open
	cap log close
	
